## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## corrplot 0.95 loaded
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Loading required package: lattice
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 'data.frame':    3000 obs. of  17 variables:
##  $ userid         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ date.crea      : chr  "9/17/2011" "1/17/2017" "5/14/2019" "11/27/2015" ...
##  $ score          : num  1.5 8.95 2.5 2.82 2.12 ...
##  $ n.matches      : int  11 56 13 32 21 14 10 9 6 31 ...
##  $ n.updates.photo: int  5 2 3 5 1 2 1 1 -1 2 ...
##  $ n.photos       : int  6 6 4 2 4 6 4 3 4 5 ...
##  $ last.connex    : chr  "10/7/2011" "1/31/2017" "6/17/2019" "1/15/2016" ...
##  $ last.up.photo  : chr  "10/2/2011" "2/3/2017" "6/19/2019" "12/9/2015" ...
##  $ last.pr.update : logi  NA NA NA NA NA NA ...
##  $ gender         : int  1 1 1 0 0 1 0 0 1 1 ...
##  $ sent.ana       : num  6.49 4.59 6.47 5.37 5.57 ...
##  $ length.prof    : num  0 20.7 31.4 0 38.5 ...
##  $ voyage         : int  0 0 0 0 0 0 1 1 0 1 ...
##  $ laugh          : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ photo.elevator : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ photo.beach    : int  0 1 1 1 0 0 1 0 0 0 ...
##  $ Country        : chr  "France" "Germany" "England" "France" ...

Subsetting the data to perform MCA and PCA:

# Drop identifiers and date columns that play no role in the analysis
drop_cols <- c(
  "last.up.photo", "last.pr.update", "last.connex",
  "date.crea", "Country", "userid"
)
tinder <- select(tinder, -all_of(drop_cols))

# Names of the categorical (binary-coded) variables
qualitative <- c("photo.beach", "photo.elevator", "laugh", "voyage", "gender")

# Split `tinder` into its qualitative and quantitative parts
tinder_qualitative <- select(tinder, all_of(qualitative))
tinder_quantitative <- select(tinder, -all_of(qualitative))

head(tinder_quantitative)
##      score n.matches n.updates.photo n.photos sent.ana length.prof
## 1 1.495834        11               5        6 6.490446     0.00000
## 2 8.946863        56               2        6 4.589125    20.72286
## 3 2.496199        13               3        4 6.473182    31.39928
## 4 2.823579        32               5        2 5.368982     0.00000
## 5 2.117433        21               1        4 5.573949    38.51022
## 6 1.700014        14               2        6 5.464667    23.11221
# Keep plain data.frame copies of both subsets
tinder_qualitative_df <- as.data.frame(tinder_qualitative)
tinder_quantitative_df <- as.data.frame(tinder_quantitative)

# MCA requires factors, so coerce every qualitative column
tinder_qualitative <- mutate(tinder_qualitative, across(everything(), as.factor))

Performing a PCA on the quantitative variables of the data set:

tinder_pca<-prcomp(tinder_quantitative,scale=TRUE)

Seeing the number of principal components needed for the analysis:

# Proportion of total variance explained by each principal component
pc_variances <- tinder_pca$sdev^2
tinder_pve <- pc_variances / sum(pc_variances)

# Scree/elbow plot of the explained-variance proportions
plot(
  tinder_pve,
  type = "b",
  pch = 19,
  col = "blue",
  xlab = "# of components",
  ylab = "% Variance explained",
  ylim = c(0, 1),
  main = "Elbow Plot of PCA"
)

The elbow plot doesn’t show a clear inflection point. However, the variance explained by 2 principal components only sums to about 60%. This suggests that we shouldn’t reduce the dimensions of the dataset.

Performing MCA:

# Preview the factor-coded qualitative data before running MCA
head(tinder_qualitative)
##   photo.beach photo.elevator laugh voyage gender
## 1           0              0     0      0      1
## 2           1              0     0      0      1
## 3           1              0     0      0      1
## 4           1              0     0      0      0
## 5           0              0     1      0      0
## 6           0              0     0      0      1
# Run MCA on the factor-coded qualitative variables
tinder_mca <- MCA(tinder_qualitative, graph = FALSE)

# Project the variable categories onto the first two dimensions.
# Fixed: fviz_mca_var() is the factoextra function intended for MCA
# results (fviz_pca_var() targets PCA objects such as prcomp/PCA).
fviz_mca_var(tinder_mca, col.var = "red", xlab = 'PC 1', ylab = 'PC 2')

Here we see several vectors whose magnitude of projection onto an axis represents the contribution a factor has on a specific principal component. Here we see values such as Gender_1 and photo.beach_1 having significance on PC 1 while having no bearing on PC 2. Vectors with smaller magnitudes aren’t significant in the graphed PCs, but this doesn’t mean that they’re irrelevant…

To illustrate this we can print the contribution table:

# Contribution (%) of each category to each MCA dimension
variable_contributions<-tinder_mca$var$contrib

# Subsetting the first 5 dimensions and converting to a df to print neatly
contribution_table <- as.data.frame(variable_contributions[, 1:5])
print(contribution_table, row.names = TRUE)
##                       Dim 1        Dim 2      Dim 3       Dim 4       Dim 5
## photo.beach_0     5.4946973  0.003399576  0.2758844  0.03539420  5.95022575
## photo.beach_1    26.5132483  0.016403777  1.3312091  0.17078561 28.71128346
## photo.elevator_0  3.5745436  0.097037421  0.3207350  0.59363478  9.51946387
## photo.elevator_1 18.7199073  0.508185580  1.6796910  3.10886904 49.85349168
## laugh_0           0.1354581  5.810312265  6.7772077  6.74330511  0.09904889
## laugh_1           0.5568318 23.884639684 27.8592884 27.71992373  0.40716348
## voyage_0          0.3289347  8.641332018  0.5184313 11.93898139  0.57345266
## voyage_1          1.1549812 30.342120694  1.8203566 41.92108505  2.01355183
## gender_0         21.6338805  0.003883415  0.1540573  0.03549821  0.20269683
## gender_1         20.7507929  1.062594971  3.6202096  0.13053197  0.01801013
## gender_2          1.1367242 29.630090600 55.6429296  7.60199090  2.65161141
fviz_contrib(tinder_mca, choice = "var", axes = 1:5)

This table shows the percentage of information contributed to each PC by each variable. Going off of the variable Gender_2, we can see that it composes ~1% of PC1 but 55% of pc2’s variance.

Looking more broadly, we can see the % of variance explained by each principal component. This essentially tells us how many PCs are needed to model the original data. In this case, if we used 5 PCs we could model the original data with up to 86% accuracy.

# Eigenvalue table: inertia and % of variance per MCA dimension
inertia <- as.data.frame(tinder_mca$eig)
print(inertia)
##       eigenvalue percentage of variance cumulative percentage of variance
## dim 1  0.2556924               21.30770                          21.30770
## dim 2  0.2040141               17.00117                          38.30887
## dim 3  0.1988266               16.56888                          54.87775
## dim 4  0.1967857               16.39880                          71.27655
## dim 5  0.1880749               15.67291                          86.94946
## dim 6  0.1566064               13.05054                         100.00000

The inertia of the eigenvalues/principal components is relatively similar to the PCA.

# Scree plot: percentage of variance explained by each MCA dimension
fviz_screeplot(tinder_mca, addlabels = TRUE, main = "Elbow Plot for MCA",ylim=c(0,30))

head((tinder_mca$ind$coord))
##        Dim 1      Dim 2       Dim 3     Dim 4      Dim 5
## 1  0.2610285 -0.3482267  0.09668760 0.0506516 -0.2076804
## 2  0.9319911 -0.3649160 -0.05365775 0.1045024  0.4905410
## 3  0.9319911 -0.3649160 -0.05365775 0.1045024  0.4905410
## 4  0.3439451 -0.3021685  0.09417473 0.1159881  0.4534615
## 5 -0.4208131  0.3288199 -0.41892582 0.7239216 -0.3249656
## 6  0.2610285 -0.3482267  0.09668760 0.0506516 -0.2076804

A factor mapping on the PCs shows us the significance of each factor on a specific principal component. In real life this tells us that having a laughing photo in your profile doesn’t significantly impact the score of the profile, whereas having a photo from a trip, or beach, is more likely to impact profile score.

Factor mapping of qualitative variables: Plotting the points of individuals on a factorial plane:

# Scatter the individuals on the first two MCA dimensions
fviz_mca_ind(
  tinder_mca,
  geom = "point",       # points only, no text labels
  col.ind = "blue",     # one color for every individual
  repel = TRUE,         # avoid label overlap
  title = "Individuals on the Factorial Plane"
)

Now using our principal components we will cluster the data. In this scenario, clustering will form groups consisting of profiles with similar characteristics.

Looking at the graph above, the data seems to be grouped into four groups. Due to the nature of this data, it seems that k-means or hierarchical clustering would be the best methods to cluster this data.

Performing K-means:

# K-means (k = 4) on the individuals' MCA coordinates
set.seed(321)
kmeans_qual <- kmeans(tinder_mca$ind$coord, centers = 4)

# Factor-code the cluster labels so they plot as discrete groups
kmeans_clusters <- factor(kmeans_qual$cluster)

# Overlay the k-means clusters on the MCA factorial plane
fviz_mca_ind(
  tinder_mca,
  col.ind = kmeans_clusters,  # color points by cluster
  palette = "jco",
  addEllipses = TRUE,         # concentration ellipse per cluster
  repel = TRUE,
  label = "none",             # no per-point labels
  arrow = FALSE,
  title = "K-means Clustering on MCA"
)

This grouping doesn’t look right, lets try hierarchical clustering…

# Hierarchical clustering on the MCA coordinates
# (Euclidean distances, complete linkage)
dist_mca <- dist(tinder_mca$ind$coord)
hclust_qual <- hclust(dist_mca, method = "complete")

# Cut the dendrogram into 4 groups and factor-code the labels
hclust_clusters <- cutree(hclust_qual, k = 4)
hclust_clusters_factor <- factor(hclust_clusters)

# Overlay the hierarchical clusters on the MCA factorial plane
fviz_mca_ind(
  tinder_mca,
  col.ind = hclust_clusters_factor,  # color points by cluster
  palette = "jco",
  addEllipses = TRUE,                # concentration ellipse per cluster
  repel = TRUE,
  label = "none",
  arrow = FALSE,
  title = "Hierarchical Clustering on MCA"
)

This gives a similar result to the kmeans clustering. To get a better understanding on why the clustering is the way that it is, we can try plotting the clusters using 3 pc’s.

# First three MCA dimensions for a 3D view of the clusters
mca_3d_coords <- tinder_mca$ind$coord[, 1:3]

# 3D scatter colored with the same k-means labels as the 2D plot,
# with axis titles attached in the same pipeline
plot <- plot_ly(
  x = ~mca_3d_coords[, 1],
  y = ~mca_3d_coords[, 2],
  z = ~mca_3d_coords[, 3],
  type = "scatter3d",
  mode = "markers",
  color = ~kmeans_clusters,
  colors = "Set2"
) %>%
  layout(
    title = "3D K-means Clustering on MCA",
    scene = list(
      xaxis = list(title = "Dim 1"),
      yaxis = list(title = "Dim 2"),
      zaxis = list(title = "Dim 3")
    )
  )

# Render the figure
plot

In this graph we see that in 3 dimensions that the data doesn’t follow the pattern that graphing in 2d followed. *Note this graph actually contains 3000 observations. Since the data of the MCA is formed by factors, there are many points that overlap.

Now let's analyze the PCA:

Let's see what quantitative variables are correlated:

 # Correlation heatmap of the quantitative variables
 coor_plot <- ggcorr(data = tinder_quantitative, label = TRUE)
 print(coor_plot)

The only variables that have a significant relationship are the “n.matches” and the “score”. This is since score is in part calculated by using n matches.

Using the PCA we can visualize the contribution of each quantitative variable on the PCs:

# Variable map of the PCA: arrow direction/length shows how each
# quantitative variable loads on the displayed components, colored by
# its contribution.
# Fixed: the misspelled argument `rapel` (silently absorbed by `...`)
# is now `repel = TRUE` so labels are actually repelled.
fviz_pca_var(
  tinder_pca,
  col.var = "contrib",
  gradient.cols = c("blue", "purple", "red"),
  repel = TRUE
)

Here we can once again see the percentage of variance explained by the individual PCs:

# Eigenvalue/variance table for the PCA.
# Fixed two defects:
#  * `prcomp` objects have no `$eig` component (that is a FactoMineR
#    field), so `tinder_pca$eig` was NULL; use factoextra's
#    get_eigenvalue() to build the table from the prcomp result.
#  * the old code printed `inertia` (the MCA table) instead of the
#    newly created PCA table.
inertia_pca <- as.data.frame(get_eigenvalue(tinder_pca))
print(inertia_pca)
##       eigenvalue percentage of variance cumulative percentage of variance
## dim 1  0.2556924               21.30770                          21.30770
## dim 2  0.2040141               17.00117                          38.30887
## dim 3  0.1988266               16.56888                          54.87775
## dim 4  0.1967857               16.39880                          71.27655
## dim 5  0.1880749               15.67291                          86.94946
## dim 6  0.1566064               13.05054                         100.00000

To group users, we can plot them on a plane of pc’s. In doing this we can hopefully see a pattern between user characteristics and the number of matches they have.

Plotting People on a factorial plane:

# Plot individuals on the first two principal components (Dim 1 and Dim 2)
fviz_pca_ind(
  tinder_pca,
  axes = c(1, 2),       # Dimensions to plot
  geom.ind = "point",   # Use points for individuals
  col.ind = tinder$n.matches,     # Color by the user's number of matches
  gradient.cols = c("yellow", "orange", "red"),  # Color gradient
  repel = TRUE          # Avoid overlapping labels
) +
ggtitle("Individuals on the Factorial Plane (Dim 1 vs Dim 2)")

Here we can see that there is a positive correlation between Dim 1 and the user's number of matches, which is represented by the color.

To get more information on users we can cluster them by all principal components.

Let's see how many clusters we should use with an elbow plot:

str(tinder_pca)
## List of 5
##  $ sdev    : num [1:6] 1.531 1.026 0.981 0.935 0.819 ...
##  $ rotation: num [1:6, 1:6] 0.5989 0.6116 0.3174 0.0032 0.4068 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
##   .. ..$ : chr [1:6] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:6] 1.95 16.78 2.07 3.52 5 ...
##   ..- attr(*, "names")= chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
##  $ scale   : Named num [1:6] 1.08 10.1 1.53 1.71 2.24 ...
##   ..- attr(*, "names")= chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
##  $ x       : num [1:3000, 1:6] 0.312 6.163 0.504 2.11 0.186 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:6] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# All six PC scores as a data frame, then the within-cluster
# sum-of-squares curve to pick k for k-means (elbow method)
pca_coords <- as.data.frame(tinder_pca$x[, 1:6])
fviz_nbclust(pca_coords, kmeans, method = "wss")

I decided to use 6 clusters for the Gaussian Mixture and K-means model so that we get moderate variation between clusters, while maintaining a decent square error. At 6 clusters there is on average a squared error of ~3 matches.

# Load required libraries
library(FactoMineR)
library(factoextra)

# K-means with 6 clusters on all six PC scores.
# Fixed: the default iter.max of 10 was too low (kmeans warned
# "did not converge in 10 iterations"), so the cap is raised.
set.seed(42)  # Set seed for reproducibility
kmeans_result <- kmeans(pca_coords, centers = 6, nstart = 250, iter.max = 50)
## Warning: did not converge in 10 iterations
# Inspect cluster sizes, centers, and within-cluster sums of squares
print(kmeans_result)
## K-means clustering with 6 clusters of sizes 400, 613, 450, 599, 523, 415
## 
## Cluster means:
##          PC1        PC2         PC3         PC4         PC5         PC6
## 1  2.7450964 -0.2532806 -0.04801216  0.33038799  0.27276275 -0.01576899
## 2 -1.0223461 -0.3415785 -0.90078490  0.44907999  0.08422322 -0.01472560
## 3  0.4817928 -0.1383272 -0.84011420 -0.94089828 -0.28145885  0.01510575
## 4 -1.3370468  0.7940561  0.54462685 -0.07391694  0.14623527 -0.02822632
## 5  0.6174048  0.9186106  0.53412828  0.14272677 -0.08436620  0.02872047
## 6 -0.5064069 -1.4051224  0.82857065 -0.03471531 -0.18686471  0.02511702
## 
## Clustering vector:
##    [1] 3 1 5 3 5 6 2 4 4 1 5 4 2 3 1 3 6 3 2 2 6 4 2 3 4 1 2 3 2 4 2 4 5 3 4 6 4
##   [38] 4 3 3 2 3 6 4 6 1 3 4 2 4 6 6 6 2 1 1 1 1 2 4 5 4 2 3 2 2 3 3 2 3 2 6 6 3
##   [75] 3 4 3 1 4 3 6 4 2 6 5 5 6 6 1 4 4 5 4 3 4 4 6 6 3 5 2 3 2 3 6 6 4 2 3 5 2
##  [112] 5 6 6 5 4 6 4 6 3 6 3 3 2 5 1 2 3 5 1 5 3 2 4 5 6 3 5 1 5 2 6 5 4 2 1 1 1
##  [149] 5 6 3 2 6 1 2 4 5 1 2 1 5 2 5 6 1 2 1 1 2 2 2 5 3 1 3 2 3 2 5 1 3 4 1 3 4
##  [186] 4 6 2 2 6 4 2 1 2 6 2 6 6 3 6 4 5 2 5 6 2 3 1 2 1 4 6 1 4 4 5 4 2 3 6 6 6
##  [223] 1 3 1 1 3 3 2 1 4 4 2 6 2 4 4 1 4 6 4 5 3 2 2 3 4 1 1 3 4 3 2 6 6 2 3 4 1
##  [260] 6 3 1 3 5 3 1 3 5 2 6 6 2 1 4 4 2 5 4 4 6 6 5 2 6 1 3 4 1 2 5 4 2 5 6 5 2
##  [297] 3 3 6 2 5 1 1 5 5 5 1 2 1 6 2 2 2 2 2 3 2 5 4 3 4 6 4 3 1 5 4 2 2 3 4 1 5
##  [334] 4 2 2 4 1 4 3 5 1 1 4 2 4 6 1 5 5 2 3 4 2 5 2 4 5 2 1 2 2 1 5 5 2 3 5 4 4
##  [371] 5 2 3 2 4 3 2 5 5 5 5 6 5 5 4 1 4 5 6 5 6 4 1 5 4 1 5 2 4 1 6 2 6 4 6 6 2
##  [408] 5 1 6 4 2 2 4 5 6 2 4 4 5 2 3 5 4 1 5 5 4 3 5 1 4 4 5 5 2 3 1 6 2 5 3 6 3
##  [445] 2 1 6 3 5 6 6 1 1 3 3 1 5 6 1 4 6 5 5 2 6 5 2 2 4 1 2 4 5 6 4 2 4 4 2 1 5
##  [482] 4 3 4 5 3 4 2 3 4 6 3 5 3 6 1 5 2 5 2 2 3 2 6 3 3 3 2 3 6 6 6 5 3 2 2 2 4
##  [519] 5 4 2 5 6 2 4 4 4 1 4 5 3 5 4 2 3 1 1 2 6 2 4 2 3 5 6 4 3 2 3 6 5 3 3 2 4
##  [556] 6 6 5 4 2 4 5 4 5 2 2 1 2 6 5 3 4 1 4 4 4 1 1 4 2 3 6 4 2 2 4 1 2 2 1 1 2
##  [593] 2 5 1 2 4 3 2 4 3 2 3 4 5 4 4 6 4 3 1 2 2 3 2 2 5 1 1 6 5 4 6 3 3 3 2 4 1
##  [630] 2 6 5 3 4 2 1 2 5 3 6 6 4 1 4 3 4 3 4 6 3 4 5 6 4 4 2 4 6 2 5 6 1 6 5 6 6
##  [667] 4 5 4 5 3 2 3 2 1 4 1 5 4 1 1 2 4 4 5 4 2 1 1 3 2 4 3 4 3 3 4 6 2 1 4 3 2
##  [704] 4 6 4 4 2 6 2 2 4 6 4 4 5 6 2 1 3 2 6 3 4 5 5 5 3 5 2 1 5 1 6 4 4 1 1 1 4
##  [741] 5 2 5 4 4 6 2 2 4 4 5 4 4 5 3 2 4 5 2 1 3 6 3 2 3 3 5 2 5 2 3 4 5 6 5 3 1
##  [778] 3 6 4 1 3 5 1 2 5 1 6 5 2 2 6 5 1 4 5 6 4 5 2 4 2 5 4 2 6 1 2 2 6 5 3 1 4
##  [815] 6 1 1 6 1 4 5 5 4 4 4 2 4 3 2 6 5 1 6 1 2 5 1 5 2 5 2 4 5 1 1 5 3 4 2 2 2
##  [852] 5 1 4 6 5 1 4 4 3 2 4 2 5 1 5 4 1 4 2 2 2 4 1 6 6 4 2 2 2 2 4 2 5 2 4 6 5
##  [889] 3 6 1 1 6 2 2 6 4 3 6 3 6 2 1 4 3 1 5 1 2 1 5 2 5 4 4 1 3 4 2 4 3 4 4 5 4
##  [926] 6 5 2 3 2 5 2 6 5 4 5 1 5 5 3 4 5 1 5 4 4 3 1 3 5 3 1 6 3 5 5 2 4 1 2 2 6
##  [963] 4 4 3 6 5 3 5 4 1 5 4 4 4 2 4 1 6 2 5 5 1 2 3 4 4 5 5 2 2 2 5 5 6 5 4 5 2
## [1000] 2 1 5 6 5 3 6 6 4 4 2 2 3 4 6 6 6 5 1 1 3 5 4 2 5 2 2 1 6 4 6 6 4 4 4 2 1
## [1037] 6 6 4 4 3 5 5 5 2 3 2 1 1 6 4 3 5 6 4 1 1 5 5 4 6 4 5 3 2 1 2 6 3 6 2 2 2
## [1074] 6 1 2 2 2 3 2 4 3 5 1 5 2 1 4 5 4 2 6 5 2 5 2 5 3 3 5 4 5 1 1 5 6 6 6 2 1
## [1111] 1 2 2 2 1 2 6 1 5 1 6 2 4 6 4 5 3 3 6 2 1 2 5 4 6 2 5 3 6 4 4 4 3 1 2 2 3
## [1148] 1 5 4 3 5 4 1 4 5 4 3 3 6 5 6 6 5 6 4 4 4 3 4 2 4 5 5 5 1 1 2 6 3 4 6 2 1
## [1185] 4 6 6 2 5 6 5 4 2 4 2 3 1 6 3 2 4 5 1 4 1 5 5 2 3 2 2 1 3 6 5 4 3 2 2 1 6
## [1222] 2 4 2 1 5 3 6 2 4 2 2 4 4 6 4 4 3 2 4 5 5 4 2 4 1 5 2 2 4 3 3 6 2 6 2 5 3
## [1259] 4 5 2 6 5 4 5 5 3 5 3 1 6 4 3 5 6 1 5 4 3 1 6 6 4 5 4 6 5 5 6 3 4 5 2 1 6
## [1296] 6 5 1 4 4 3 5 3 4 2 4 3 2 4 2 5 4 5 3 5 3 2 3 6 6 4 4 2 2 5 5 2 5 3 1 4 4
## [1333] 2 4 1 3 3 2 6 4 2 1 4 5 2 5 5 4 2 1 4 5 6 4 5 4 2 2 4 4 1 2 5 6 4 1 1 5 4
## [1370] 6 6 1 3 5 3 6 2 2 3 1 4 3 3 5 6 2 4 4 2 3 4 4 6 5 2 2 3 5 6 3 2 1 2 6 6 2
## [1407] 2 2 6 5 2 5 2 4 4 5 6 4 2 2 5 5 5 6 6 3 1 4 5 5 5 1 4 5 2 5 1 6 6 1 6 6 3
## [1444] 2 6 3 3 5 4 4 2 5 5 4 5 2 3 5 3 2 1 5 3 6 3 5 4 2 4 5 4 5 2 2 2 1 5 6 2 2
## [1481] 1 3 2 6 1 2 6 3 6 4 5 4 2 2 3 6 3 2 3 1 1 2 2 1 3 2 2 2 5 3 5 1 2 3 5 2 5
## [1518] 2 4 5 6 3 1 2 2 3 1 5 6 1 1 3 3 4 3 3 3 6 2 3 2 3 4 5 4 5 2 1 2 1 4 1 4 6
## [1555] 4 3 2 4 6 5 6 4 3 4 2 5 2 4 4 2 2 5 3 3 5 4 1 3 4 4 5 2 1 2 2 4 6 1 6 1 6
## [1592] 1 2 2 4 4 4 1 2 2 4 5 4 4 1 5 6 6 5 4 5 4 4 3 4 4 1 4 2 1 2 5 5 1 3 6 4 4
## [1629] 5 3 1 5 5 2 5 2 2 6 4 3 5 5 6 4 3 1 3 1 3 4 4 4 3 2 4 1 3 5 5 4 3 6 4 6 3
## [1666] 3 4 4 1 3 3 2 3 2 3 3 3 2 1 5 1 4 5 1 5 1 2 4 6 1 1 2 4 2 5 2 5 2 6 1 3 2
## [1703] 1 4 3 4 2 3 4 1 3 4 5 6 6 4 3 6 4 5 2 1 5 6 3 5 5 6 1 2 6 1 6 4 6 2 4 4 1
## [1740] 1 5 6 2 4 5 1 5 1 1 6 4 1 1 4 5 4 5 6 4 5 5 4 5 6 4 4 5 4 1 1 5 2 4 4 6 5
## [1777] 2 4 6 4 1 2 2 1 1 3 3 2 4 4 5 2 4 1 1 1 3 2 5 4 6 2 4 3 5 4 4 5 1 5 5 5 5
## [1814] 2 2 5 2 4 3 5 3 2 4 5 4 2 2 4 4 3 4 5 6 2 3 4 4 2 2 4 3 5 4 3 4 2 2 2 5 1
## [1851] 6 3 5 3 6 3 6 2 5 2 3 2 3 2 3 5 5 6 2 1 6 4 3 3 4 5 4 1 4 6 5 2 6 5 6 2 4
## [1888] 2 3 1 1 3 1 4 1 2 6 4 1 6 3 2 3 2 6 2 4 4 2 6 4 4 4 4 5 2 2 5 3 1 5 6 3 5
## [1925] 2 5 6 6 5 2 6 3 6 1 2 4 1 4 2 4 3 6 2 2 3 5 5 2 2 5 5 4 1 5 6 3 2 2 2 5 3
## [1962] 6 3 2 2 3 2 6 4 6 5 3 6 6 5 2 5 1 1 2 4 1 6 2 6 4 1 2 3 5 3 6 4 3 4 3 2 2
## [1999] 3 4 3 3 5 2 4 4 2 4 4 2 6 1 6 1 5 6 2 1 5 3 2 4 5 3 2 1 3 4 3 6 5 5 4 3 6
## [2036] 2 1 1 5 6 6 3 3 5 3 2 4 6 3 6 1 1 1 3 6 5 3 4 3 4 5 6 3 2 3 6 2 5 3 3 4 4
## [2073] 5 4 5 3 2 6 4 4 6 1 6 3 5 5 2 2 4 2 3 4 6 3 3 2 1 4 4 1 2 1 4 1 4 2 6 6 3
## [2110] 4 5 1 1 6 3 2 5 3 3 2 6 2 1 5 1 5 4 2 6 2 2 2 1 4 1 3 5 6 2 5 2 2 2 6 4 2
## [2147] 4 6 5 1 6 4 5 3 2 2 1 2 5 3 3 5 1 5 4 2 2 6 6 2 5 4 5 6 2 4 4 2 3 6 2 6 2
## [2184] 4 2 6 4 5 5 4 6 1 3 3 2 4 4 6 4 6 2 2 2 2 4 6 1 4 1 6 5 4 3 5 3 6 5 3 4 5
## [2221] 6 4 4 2 3 4 4 4 5 2 4 6 6 6 6 4 4 2 3 6 3 3 2 2 3 4 6 3 1 6 4 2 4 1 6 3 4
## [2258] 4 3 1 6 5 6 4 3 3 1 6 3 1 6 5 5 3 1 1 3 2 4 3 3 1 2 2 1 6 1 6 2 3 3 1 1 2
## [2295] 4 3 4 5 2 3 5 2 6 1 4 2 3 5 4 2 2 2 3 4 5 4 1 1 5 1 5 6 4 5 6 6 5 5 4 2 2
## [2332] 1 2 6 2 6 4 3 2 4 3 6 4 2 4 6 5 3 5 4 4 4 3 5 2 1 3 5 6 6 4 5 1 2 3 3 6 2
## [2369] 2 1 3 1 3 3 5 3 4 5 5 1 5 5 4 5 1 2 3 6 3 1 3 4 4 2 2 1 5 5 2 3 2 6 1 2 5
## [2406] 5 1 4 3 1 4 2 1 5 3 2 1 2 2 5 4 4 4 1 5 2 5 3 2 4 4 2 1 4 6 3 5 4 2 3 2 1
## [2443] 2 6 6 5 5 6 3 2 4 4 4 2 2 4 4 1 5 1 4 6 1 3 2 3 2 2 5 5 1 2 5 1 6 4 2 1 4
## [2480] 6 3 4 6 5 5 2 4 6 1 1 4 3 5 2 2 5 5 5 2 1 6 2 3 6 4 5 1 3 4 2 3 5 4 4 2 6
## [2517] 5 4 3 3 4 4 1 3 4 5 1 3 3 2 2 1 6 6 6 6 5 2 2 4 5 3 5 4 2 6 2 6 4 1 4 3 3
## [2554] 3 3 3 3 3 2 6 4 2 5 2 4 6 2 1 4 2 5 1 4 1 4 6 5 5 4 6 2 1 2 4 5 6 3 3 1 4
## [2591] 3 5 3 3 2 3 2 4 6 2 3 5 4 4 3 4 4 4 4 6 2 5 5 4 2 5 3 2 2 1 1 2 5 5 6 2 2
## [2628] 4 2 4 3 4 4 6 5 4 2 6 4 2 1 2 6 5 4 4 4 5 2 2 3 6 1 1 2 2 1 2 3 2 2 3 5 5
## [2665] 5 1 5 3 5 1 5 2 3 4 6 4 2 2 4 5 2 2 5 2 1 2 2 1 5 4 6 4 6 2 3 4 6 3 6 3 2
## [2702] 4 1 4 1 6 6 1 2 4 3 6 3 1 2 2 3 5 5 1 3 1 1 5 5 5 5 1 3 1 5 5 6 1 1 3 6 3
## [2739] 2 2 3 3 1 4 5 5 5 1 2 1 3 5 6 1 2 2 1 3 1 5 5 1 3 1 5 3 4 5 1 1 4 4 2 4 3
## [2776] 1 6 3 4 1 5 6 2 6 3 2 4 4 6 3 2 4 5 1 6 6 1 3 5 2 6 2 2 3 1 2 1 6 5 4 2 1
## [2813] 5 2 6 2 6 3 2 4 4 2 6 4 1 4 5 2 3 4 4 4 3 2 5 1 5 4 2 3 6 5 6 5 6 4 3 3 2
## [2850] 5 6 5 1 4 3 3 5 5 4 2 3 1 5 5 1 5 4 4 2 5 6 2 5 6 4 4 1 5 3 2 2 4 2 5 4 6
## [2887] 5 4 5 2 3 1 4 6 4 2 5 4 4 2 2 5 2 6 4 3 6 6 6 5 3 3 2 3 6 6 5 3 3 2 2 1 6
## [2924] 5 5 6 6 2 4 4 6 5 4 4 1 3 1 3 5 2 4 2 5 4 5 4 3 2 5 2 5 4 5 2 1 5 4 6 1 6
## [2961] 4 4 1 2 4 3 4 2 2 1 5 2 4 1 5 4 3 5 6 5 2 1 5 5 2 4 3 4 2 2 5 3 1 3 2 3 5
## [2998] 1 1 5
## 
## Within cluster sum of squares by cluster:
## [1] 1877.886 1363.571 1453.163 1463.862 1423.827 1421.426
##  (between_SS / total_SS =  50.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Color the PCA individuals plot by their k-means cluster assignment
cluster_labels <- as.factor(kmeans_result$cluster)
fviz_pca_ind(
  tinder_pca,
  geom.ind = "point",          # points, no text labels
  col.ind = cluster_labels,    # discrete cluster colors
  palette = "Set2",
  addEllipses = TRUE,          # confidence ellipse per cluster
  legend.title = "Clusters"
) +
  ggtitle("K-Means Clustering on PCA Coordinates")

# Fit a Gaussian mixture model to the six PC scores.
# NOTE(review): the text above says 6 clusters were chosen for both the
# GMM and k-means models, but G = 4 is used here -- confirm which is intended.
gmm_result <- Mclust(pca_coords, G = 4)  # assigning 4 clusters
pca_coords$Cluster <- as.factor(gmm_result$classification)  # For GMM

Comparing to another clustering method…

# Scatter the GMM clusters on the first two PCs, with one
# concentration ellipse per cluster
ggplot(pca_coords, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 2, alpha = 0.6) +
  stat_ellipse(aes(fill = Cluster), alpha = 0.2, geom = "polygon") +
  scale_color_brewer(palette = "Set1") +  # point colors
  scale_fill_brewer(palette = "Set1") +   # matching ellipse fills
  labs(
    title = "Gaussian Mixture Model Clusters with Ellipses",
    x = "Principal Component 1 (PC1)",
    y = "Principal Component 2 (PC2)",
    color = "Cluster",
    fill = "Cluster"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "right")

Now we can compute the average measurements for each cluster…

# Per-cluster means of every quantitative variable, largest n.matches first.
# Fixed two defects:
#  * `tinder_quantitative` has no `Cluster` column -- the GMM labels were
#    stored on `pca_coords` -- so the old `group_by(tinder_quantitative$Cluster)`
#    grouped by NULL and collapsed everything into a single row of global
#    means (visible in the 1-row tibble printed below). The labels are now
#    attached to the data before grouping.
#  * `across(..., mean, na.rm = TRUE)` is deprecated since dplyr 1.1.0;
#    arguments are passed through an anonymous function instead.
cluster_summary <- tinder_quantitative %>%
  mutate(Cluster = as.factor(gmm_result$classification)) %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>%
  arrange(desc(n.matches))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
print(cluster_summary)
## # A tibble: 1 × 6
##   score n.matches n.updates.photo n.photos sent.ana length.prof
##   <dbl>     <dbl>           <dbl>    <dbl>    <dbl>       <dbl>
## 1  1.95      16.8            2.07     3.52     5.00        16.1

These clusters reinforce that the only quantitative factors (from this dataset) that can be used to linearly predict the # of matches of a user is sent.ana, and n.updates.photo. However, it seems that there may be a non linear relationship between n.photos and score.